In [1]:
import os
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats.kde import gaussian_kde
from numpy import linspace
import seaborn as sns
import gzip
import HTSeq
from functools import reduce
from matplotlib_venn import venn2
from matplotlib_venn import venn3

Define input paths

In [2]:
results_path = "/scicore/home/zavolan/gypas/projects/EXTERNAL/Evan_NMD/long_reads_alternative_analysis/results/comparison/"

# splicing
SE_path_dKD = os.path.join(results_path, "comparison_scr_4_5_6_dKD/rmats/SE.MATS.JCEC.txt")
SE_path_SMG6 = os.path.join(results_path, "comparison_scr_1_2_3_SMG6KD/rmats/SE.MATS.JCEC.txt")
SE_path_SMG7 = os.path.join(results_path, "comparison_scr_4_5_6_SMG7KD/rmats/SE.MATS.JCEC.txt")
SE_path_UPF1 = os.path.join(results_path, "comparison_scr_1_2_3_UPF1KD/rmats/SE.MATS.JCEC.txt")

SE_path_SMG6KD_SMG6rescue = os.path.join(results_path, "comparison_SMG6KD_SMG6rescue/rmats/SE.MATS.JCEC.txt")
SE_path_SMG7KD_SMG7rescue = os.path.join(results_path, "comparison_SMG7KD_SMG7rescue/rmats/SE.MATS.JCEC.txt")
SE_path_UPF1KD_UPF1rescue = os.path.join(results_path, "comparison_UPF1KD_UPF1rescue/rmats/SE.MATS.JCEC.txt")

SE_path_dKD_dKDSMG6rescue = os.path.join(results_path, "comparison_dKD_dKDSMG6rescue/rmats/SE.MATS.JCEC.txt")
SE_path_dKD_dKDSMG7rescue = os.path.join(results_path, "comparison_dKD_dKDSMG7rescue/rmats/SE.MATS.JCEC.txt")

Custom functions

In [3]:
def mats_prepare_events_scr_vs_KD(events_path):
    
    """Prepare events for rMATS"""
    
    events = pd.read_csv(events_path, header=0, sep="\t")
    
    # calculate mean expression levels
    IncLevel1_mean = pd.DataFrame(events.IncLevel1.str.split(',',2).tolist(),
                              columns=['IncLevel1_1', 'IncLevel1_2', 'IncLevel1_3'])
    IncLevel2_mean = pd.DataFrame(events.IncLevel2.str.split(',',2).tolist(),
                              columns=['IncLevel2_1', 'IncLevel2_2', 'IncLevel2_3'])
    IncLevel1_mean['IncLevel1_1'] = pd.to_numeric(IncLevel1_mean['IncLevel1_1'], errors='coerce')
    IncLevel1_mean['IncLevel1_2'] = pd.to_numeric(IncLevel1_mean['IncLevel1_2'], errors='coerce')
    IncLevel1_mean['IncLevel1_3'] = pd.to_numeric(IncLevel1_mean['IncLevel1_3'], errors='coerce')
    IncLevel2_mean['IncLevel2_1'] = pd.to_numeric(IncLevel2_mean['IncLevel2_1'], errors='coerce')
    IncLevel2_mean['IncLevel2_2'] = pd.to_numeric(IncLevel2_mean['IncLevel2_2'], errors='coerce')
    IncLevel2_mean['IncLevel2_3'] = pd.to_numeric(IncLevel2_mean['IncLevel2_3'], errors='coerce')
    
    events['Mean PSI Scr'] = np.mean(IncLevel1_mean, axis=1)
    events['Mean PSI KD'] = np.mean(IncLevel2_mean, axis=1)
    events['Significant'] = 'No'
    events.loc[events["FDR"]<0.05, 'Significant'] = 'FDR<0.05'
    events.loc[((events["FDR"]<0.05) & (abs(events["IncLevelDifference"])>0.3)), 'Significant'] = 'FDR<0.05 & abs(IncLevelDifference)>0.3'
        
    return events
In [4]:
def mats_prepare_events_KD_vs_rescue(events_path):
    
    """Prepare events for rMATS"""
    
    events = pd.read_csv(events_path, header=0, sep="\t")
    
    # calculate mean expression levels
    IncLevel1_mean = pd.DataFrame(events.IncLevel1.str.split(',',2).tolist(),
                              columns=['IncLevel1_1', 'IncLevel1_2', 'IncLevel1_3'])
    IncLevel2_mean = pd.DataFrame(events.IncLevel2.str.split(',',2).tolist(),
                              columns=['IncLevel2_1', 'IncLevel2_2', 'IncLevel2_3'])
    IncLevel1_mean['IncLevel1_1'] = pd.to_numeric(IncLevel1_mean['IncLevel1_1'], errors='coerce')
    IncLevel1_mean['IncLevel1_2'] = pd.to_numeric(IncLevel1_mean['IncLevel1_2'], errors='coerce')
    IncLevel1_mean['IncLevel1_3'] = pd.to_numeric(IncLevel1_mean['IncLevel1_3'], errors='coerce')
    IncLevel2_mean['IncLevel2_1'] = pd.to_numeric(IncLevel2_mean['IncLevel2_1'], errors='coerce')
    IncLevel2_mean['IncLevel2_2'] = pd.to_numeric(IncLevel2_mean['IncLevel2_2'], errors='coerce')
    IncLevel2_mean['IncLevel2_3'] = pd.to_numeric(IncLevel2_mean['IncLevel2_3'], errors='coerce')
    
    events['Mean PSI KD'] = np.mean(IncLevel1_mean, axis=1)
    events['Mean PSI rescue'] = np.mean(IncLevel2_mean, axis=1)
    events['Significant'] = 'No'
    events.loc[events["FDR"]<0.05, 'Significant'] = 'FDR<0.05'
    events.loc[((events["FDR"]<0.05) & (abs(events["IncLevelDifference"])>0.3)), 'Significant'] = 'FDR<0.05 & abs(IncLevelDifference)>0.3'
    
    return events

PART A: Identify NMD targets

In [5]:
path = SE_path_dKD
SE_MATS_dKD = mats_prepare_events_scr_vs_KD(path)
SE_MATS_dKD_short = SE_MATS_dKD.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI KD", "FDR", "IncLevelDifference"]]
SE_MATS_dKD_short.columns = SE_MATS_dKD_short.columns + "_dKD"
SE_MATS_dKD_short.reset_index(inplace=True)

path = SE_path_SMG6
SE_MATS_SMG6 = mats_prepare_events_scr_vs_KD(path)
SE_MATS_SMG6_short = SE_MATS_SMG6.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI KD", "FDR", "IncLevelDifference"]]
SE_MATS_SMG6_short.columns = SE_MATS_SMG6_short.columns + "_SMG6"
SE_MATS_SMG6_short.reset_index(inplace=True)

path = SE_path_SMG7
SE_MATS_SMG7 = mats_prepare_events_scr_vs_KD(path)
SE_MATS_SMG7_short = SE_MATS_SMG7.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI KD", "FDR", "IncLevelDifference"]]
SE_MATS_SMG7_short.columns = SE_MATS_SMG7_short.columns + "_SMG7"
SE_MATS_SMG7_short.reset_index(inplace=True)

path = SE_path_UPF1
SE_MATS_UPF1 = mats_prepare_events_scr_vs_KD(path)
SE_MATS_UPF1_short = SE_MATS_UPF1.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI KD", "FDR", "IncLevelDifference"]]
SE_MATS_UPF1_short.columns = SE_MATS_UPF1_short.columns + "_UPF1"
SE_MATS_UPF1_short.reset_index(inplace=True)

path = SE_path_SMG6KD_SMG6rescue
SE_MATS_SMG6KD_SMG6rescue = mats_prepare_events_KD_vs_rescue(path)
SE_MATS_SMG6KD_SMG6rescue_short = SE_MATS_SMG6KD_SMG6rescue.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI rescue", "FDR", "IncLevelDifference"]]
SE_MATS_SMG6KD_SMG6rescue_short.columns = SE_MATS_SMG6KD_SMG6rescue_short.columns + "_SMG6rescue"
SE_MATS_SMG6KD_SMG6rescue_short.reset_index(inplace=True)

path = SE_path_SMG7KD_SMG7rescue
SE_MATS_SMG7KD_SMG7rescue = mats_prepare_events_KD_vs_rescue(path)
SE_MATS_SMG7KD_SMG7rescue_short = SE_MATS_SMG7KD_SMG7rescue.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI rescue", "FDR", "IncLevelDifference"]]
SE_MATS_SMG7KD_SMG7rescue_short.columns = SE_MATS_SMG7KD_SMG7rescue_short.columns + "_SMG7rescue"
SE_MATS_SMG7KD_SMG7rescue_short.reset_index(inplace=True)

path = SE_path_UPF1KD_UPF1rescue
SE_MATS_UPF1KD_UPF1rescue = mats_prepare_events_KD_vs_rescue(path)
SE_MATS_UPF1KD_UPF1rescue_short = SE_MATS_UPF1KD_UPF1rescue.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI rescue", "FDR", "IncLevelDifference"]]
SE_MATS_UPF1KD_UPF1rescue_short.columns = SE_MATS_UPF1KD_UPF1rescue_short.columns + "_UPF1rescue"
SE_MATS_UPF1KD_UPF1rescue_short.reset_index(inplace=True)

path = SE_path_dKD_dKDSMG6rescue
SE_MATS_dKD_dKDSMG6rescue = mats_prepare_events_KD_vs_rescue(path)
SE_MATS_dKD_dKDSMG6rescue_short = SE_MATS_dKD_dKDSMG6rescue.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI rescue", "FDR", "IncLevelDifference"]]
SE_MATS_dKD_dKDSMG6rescue_short.columns = SE_MATS_dKD_dKDSMG6rescue_short.columns + "_dKDSMG6rescue"
SE_MATS_dKD_dKDSMG6rescue_short.reset_index(inplace=True)

path = SE_path_dKD_dKDSMG7rescue
SE_MATS_dKD_dKDSMG7rescue = mats_prepare_events_KD_vs_rescue(path)
SE_MATS_dKD_dKDSMG7rescue_short = SE_MATS_dKD_dKDSMG7rescue.set_index(["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"])[["Mean PSI rescue", "FDR", "IncLevelDifference"]]
SE_MATS_dKD_dKDSMG7rescue_short.columns = SE_MATS_dKD_dKDSMG7rescue_short.columns + "_dKDSMG7rescue"
SE_MATS_dKD_dKDSMG7rescue_short.reset_index(inplace=True)
In [6]:
SE_MATS_dKD_short.head()
Out[6]:
GeneID chr strand exonStart_0base exonEnd upstreamES upstreamEE downstreamES downstreamEE Mean PSI KD_dKD FDR_dKD IncLevelDifference_dKD
0 MSTRG.30627 chr7 - 92234448 92234592 92226525 92226682 92234807 92234923 0.944000 0.003586 0.053
1 MSTRG.30627 chr7 - 92244001 92244149 92242033 92242137 92244901 92245134 0.756667 0.013994 0.126
2 MSTRG.30627 chr7 - 92244901 92245171 92242033 92242137 92245426 92245595 0.720333 1.000000 0.017
3 MSTRG.30627 chr7 - 92244901 92245171 92242033 92242137 92245464 92245914 0.910333 0.458501 0.036
4 MSTRG.30627 chr7 - 92244901 92245171 92242033 92242137 92245905 92245943 0.979000 0.439880 -0.019

merge all

In [7]:
dfs = [SE_MATS_dKD_short,
       SE_MATS_SMG6_short,
       SE_MATS_SMG7_short,
       SE_MATS_UPF1_short,
       SE_MATS_SMG6KD_SMG6rescue_short,
       SE_MATS_SMG7KD_SMG7rescue_short,
       SE_MATS_UPF1KD_UPF1rescue_short,
       SE_MATS_dKD_dKDSMG6rescue_short,
       SE_MATS_dKD_dKDSMG7rescue_short
      ]

df_all = reduce(lambda left,right: pd.merge(left,right,on=["GeneID", "chr", "strand", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", "downstreamES", "downstreamEE"]), dfs)
In [8]:
df_all.head()
Out[8]:
GeneID chr strand exonStart_0base exonEnd upstreamES upstreamEE downstreamES downstreamEE Mean PSI KD_dKD ... IncLevelDifference_SMG7rescue Mean PSI rescue_UPF1rescue FDR_UPF1rescue IncLevelDifference_UPF1rescue Mean PSI rescue_dKDSMG6rescue FDR_dKDSMG6rescue IncLevelDifference_dKDSMG6rescue Mean PSI rescue_dKDSMG7rescue FDR_dKDSMG7rescue IncLevelDifference_dKDSMG7rescue
0 MSTRG.30627 chr7 - 92244001 92244149 92242033 92242137 92244901 92245134 0.756667 ... -0.023 0.805667 1.000000 -0.015 0.869333 0.428754 -0.113 0.796333 0.917470 -0.040
1 MSTRG.30627 chr7 - 92244901 92245171 92242033 92242137 92245426 92245595 0.720333 ... 0.024 0.695000 1.000000 0.018 0.792667 0.712554 -0.072 0.776333 0.654918 -0.056
2 MSTRG.30627 chr7 - 92244901 92245171 92242033 92242137 92245464 92245914 0.910333 ... -0.072 0.920000 0.980229 0.018 0.939000 1.000000 -0.029 0.938667 0.971221 -0.028
3 MSTRG.30627 chr7 - 92244901 92245171 92242033 92242137 92245905 92245943 0.979000 ... 0.004 0.979000 1.000000 -0.008 0.981667 1.000000 -0.003 0.982333 1.000000 -0.003
4 MSTRG.30627 chr7 - 92245426 92245595 92242033 92242137 92245905 92246059 0.942667 ... -0.007 0.947333 1.000000 -0.008 0.958667 1.000000 -0.016 0.961000 1.000000 -0.018

5 rows × 36 columns

dKD - dKDSMG6rescue

In [9]:
df_all["Significant"] = "No"
df_all.loc[(df_all["FDR_dKD"]<0.05) & (df_all["FDR_dKDSMG6rescue"]<0.05), "Significant"] = "NMD targets"

sns.set(font_scale=2)
ax = sns.lmplot(x='IncLevelDifference_dKD',
           y='IncLevelDifference_dKDSMG6rescue',
           hue='Significant',
           data=df_all,
           fit_reg=False,
           legend=True,
           palette=["gray", "red"],
           hue_order = ["No", "NMD targets"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("deltaPSI for Skipping Exons (SE)")
Out[9]:
Text(0.5, 1.0, 'deltaPSI for Skipping Exons (SE)')

dKD - dKDSMG7rescue

In [10]:
df_all["Significant"] = "No"
df_all.loc[(df_all["FDR_dKD"]<0.05) & (df_all["FDR_dKDSMG7rescue"]<0.05), "Significant"] = "NMD targets"

sns.set(font_scale=2)
ax = sns.lmplot(x='IncLevelDifference_dKD',
           y='IncLevelDifference_dKDSMG7rescue',
           hue='Significant',
           data=df_all,
           fit_reg=False,
           legend=True,
           palette=["gray", "red"],
           hue_order = ["No", "NMD targets"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("deltaPSI for Skipping Exons (SE)")
Out[10]:
Text(0.5, 1.0, 'deltaPSI for Skipping Exons (SE)')

Collect all NMD targets

In [11]:
df_all.reset_index(inplace=True)
In [12]:
df_all["event_id"] = df_all["GeneID"] + "_" + \
                     df_all["chr"] + "_" + \
                     df_all["strand"] + "_" + \
                     df_all["exonStart_0base"].map(str) + "_" + \
                     df_all["exonEnd"].map(str) + "_" + \
                     df_all["upstreamES"].map(str) + "_" + \
                     df_all["upstreamEE"].map(str) + "_" + \
                     df_all["downstreamES"].map(str) + "_" + \
                     df_all["downstreamEE"].map(str)
In [13]:
NMD_targets_SMG7 = df_all[(df_all["FDR_dKD"]<0.05) & (df_all["FDR_dKDSMG7rescue"]<0.05)]["event_id"].tolist()
In [14]:
NMD_targets_SMG6 = df_all[(df_all["FDR_dKD"]<0.05) & (df_all["FDR_dKDSMG6rescue"]<0.05)]["event_id"].tolist()
In [15]:
len(NMD_targets_SMG7), len(NMD_targets_SMG6)
Out[15]:
(2593, 6265)
In [16]:
NMD_targets = list(set(NMD_targets_SMG7 + NMD_targets_SMG6))
In [17]:
len(NMD_targets)
Out[17]:
6531
In [18]:
NMD_targets_genes = []
NMD_targets_exons = []
for target in NMD_targets:
    target_sp = target.strip().split("_")
    NMD_targets_genes.append(target_sp[0])
    NMD_targets_exons.append(target_sp[1] + ":" + str(target_sp[3]) + "-" + str(target_sp[4]) + ":" + target_sp[2])
NMD_targets_genes = list(set(NMD_targets_genes))
NMD_targets_exons = list(set(NMD_targets_exons))
In [19]:
len(NMD_targets_genes), len(NMD_targets_exons)
Out[19]:
(3579, 6317)

PART B: Splicing level NMD targets

In [20]:
path = SE_path_dKD
SE_MATS_JCEC = mats_prepare_events_scr_vs_KD(path)
SE_MATS_JCEC["event_id"] = SE_MATS_JCEC["GeneID"] + "_" + \
                           SE_MATS_JCEC["chr"] + "_" + \
                           SE_MATS_JCEC["strand"] + "_" + \
                           SE_MATS_JCEC["exonStart_0base"].map(str) + "_" + \
                           SE_MATS_JCEC["exonEnd"].map(str) + "_" + \
                           SE_MATS_JCEC["upstreamES"].map(str) + "_" + \
                           SE_MATS_JCEC["upstreamEE"].map(str) + "_" + \
                           SE_MATS_JCEC["downstreamES"].map(str) + "_" + \
                           SE_MATS_JCEC["downstreamEE"].map(str)
In [21]:
SE_MATS_JCEC["Significant"] = "No"
In [22]:
sns.set(font_scale=2)
ax = sns.lmplot(x='Mean PSI Scr',
           y='Mean PSI KD',
           hue='Significant',
           data=SE_MATS_JCEC,
           fit_reg=False,
           legend=True,
           palette=["gray"],
           hue_order = ["No"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("Mean PSI for Skipping Exons (SE)")
Out[22]:
Text(0.5, 1.0, 'Mean PSI for Skipping Exons (SE)')
In [23]:
SE_MATS_JCEC.loc[SE_MATS_JCEC["FDR"]<0.05, 'Significant'] = 'FDR<0.05'
In [24]:
sns.set(font_scale=2)
ax = sns.lmplot(x='Mean PSI Scr',
           y='Mean PSI KD',
           hue='Significant',
           data=SE_MATS_JCEC,
           fit_reg=False,
           legend=True,
           palette=["gray", "black"],
           hue_order = ["No", "FDR<0.05"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("Mean PSI for Skipping Exons (SE)")
Out[24]:
Text(0.5, 1.0, 'Mean PSI for Skipping Exons (SE)')
In [25]:
SE_MATS_JCEC["MND_target"] = "No"
SE_MATS_JCEC.loc[SE_MATS_JCEC["event_id"].isin(NMD_targets), "MND_target"] = "Yes"
In [26]:
sns.set(font_scale=2)
ax = sns.lmplot(x='Mean PSI Scr',
           y='Mean PSI KD',
           hue='MND_target',
           data=SE_MATS_JCEC,
           fit_reg=False,
           legend=True,
           palette=["gray", "black"],
           hue_order = ["No", "Yes"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("Mean PSI for Skipping Exons (SE)")
Out[26]:
Text(0.5, 1.0, 'Mean PSI for Skipping Exons (SE)')
In [27]:
SE_MATS_JCEC.loc[(SE_MATS_JCEC["FDR"]<0.05) & (SE_MATS_JCEC["MND_target"]=="Yes"), "Significant"] = "FDR<0.05 &\nNMD target"
In [28]:
sns.set(font_scale=2)
ax = sns.lmplot(x='Mean PSI Scr',
           y='Mean PSI KD',
           hue='Significant',
           data=SE_MATS_JCEC,
           fit_reg=False,
           legend=True,
           palette=["gray", "black", "red"],
           hue_order = ["No", "FDR<0.05", "FDR<0.05 &\nNMD target"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("Mean PSI for Skipping Exons (SE)")
Out[28]:
Text(0.5, 1.0, 'Mean PSI for Skipping Exons (SE)')
In [29]:
barplot_stats = pd.DataFrame(SE_MATS_JCEC["Significant"].value_counts())
barplot_stats
Out[29]:
Significant
No 22215
FDR<0.05 &\nNMD target 6531
FDR<0.05 6054

PART C: Splicing level NOVEL targets

In [30]:
bed_novel = pd.read_csv(
    "/scicore/home/zavolan/gypas/projects/EXTERNAL/Evan_NMD/long_reads_alternative_analysis/results/comparison/comparison_scr_4_5_6_dKD/rmats_processed/SE.MATS.JCEC.novel.bed",
    header=None,
    sep="\t"
)
bed_novel.columns = ["chr", "exonStart_0base", "exonEnd", "N", "score", "strand"]
bed_novel["id"] = "chr" + bed_novel["chr"] + ":" + bed_novel["exonStart_0base"].astype(str) + "-" + bed_novel["exonEnd"].astype(str) + ":" + bed_novel["strand"].astype(str)
bed_novel_ids = bed_novel["id"].tolist()
len(bed_novel_ids)
Out[30]:
2180
In [31]:
bed_novel_ids = list(set(bed_novel_ids))
len(bed_novel_ids)
Out[31]:
2103
In [32]:
path = SE_path_dKD
SE_MATS_JCEC = mats_prepare_events_scr_vs_KD(path)
SE_MATS_JCEC["event_id"] = SE_MATS_JCEC["GeneID"] + "_" + \
                           SE_MATS_JCEC["chr"] + "_" + \
                           SE_MATS_JCEC["strand"] + "_" + \
                           SE_MATS_JCEC["exonStart_0base"].map(str) + "_" + \
                           SE_MATS_JCEC["exonEnd"].map(str) + "_" + \
                           SE_MATS_JCEC["upstreamES"].map(str) + "_" + \
                           SE_MATS_JCEC["upstreamEE"].map(str) + "_" + \
                           SE_MATS_JCEC["downstreamES"].map(str) + "_" + \
                           SE_MATS_JCEC["downstreamEE"].map(str)
In [33]:
SE_MATS_JCEC["Significant"] = "No"
SE_MATS_JCEC.loc[SE_MATS_JCEC["FDR"]<0.05, 'Significant'] = 'FDR<0.05'
In [34]:
sns.set(font_scale=2)
ax = sns.lmplot(x='Mean PSI Scr',
           y='Mean PSI KD',
           hue='Significant',
           data=SE_MATS_JCEC,
           fit_reg=False,
           legend=True,
           palette=["gray", "black"],
           hue_order = ["No", "FDR<0.05"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("Mean PSI for Skipping Exons (SE)")
Out[34]:
Text(0.5, 1.0, 'Mean PSI for Skipping Exons (SE)')
In [35]:
SE_MATS_JCEC["exon"] = SE_MATS_JCEC["chr"] + ":" + SE_MATS_JCEC["exonStart_0base"].map(str) + "-" + SE_MATS_JCEC["exonEnd"].map(str) + ":" + SE_MATS_JCEC["strand"]
In [36]:
SE_MATS_JCEC.loc[(SE_MATS_JCEC["exon"].isin(bed_novel_ids)) & (SE_MATS_JCEC["FDR"]<0.05), "Significant"] = "FDR<0.05 &\nnovel exon"
In [37]:
sns.set(font_scale=2)
ax = sns.lmplot(x='Mean PSI Scr',
           y='Mean PSI KD',
           hue='Significant',
           data=SE_MATS_JCEC,
           fit_reg=False,
           legend=True,
           palette=["gray", "black", "red"],
           hue_order = ["No", "FDR<0.05", "FDR<0.05 &\nnovel exon"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("Mean PSI for Skipping Exons (SE)")
Out[37]:
Text(0.5, 1.0, 'Mean PSI for Skipping Exons (SE)')
In [38]:
SE_MATS_JCEC.loc[(SE_MATS_JCEC["exon"].isin(bed_novel_ids)) & \
                 (SE_MATS_JCEC["FDR"]<0.05) & \
                 (SE_MATS_JCEC["exon"].isin(NMD_targets_exons)), "Significant"] = "FDR<0.05 &\nnovel exon &\nNMD target"
In [39]:
sns.set(font_scale=2)
ax = sns.lmplot(x='Mean PSI Scr',
           y='Mean PSI KD',
           hue='Significant',
           data=SE_MATS_JCEC,
           fit_reg=False,
           legend=True,
           palette=["gray", "black", "red", "yellow"],
           hue_order = ["No", "FDR<0.05", "FDR<0.05 &\nnovel exon", "FDR<0.05 &\nnovel exon &\nNMD target"],
           height=15,
           aspect=1
)
ax = plt.gca()
ax.set_title("Mean PSI for Skipping Exons (SE)")
Out[39]:
Text(0.5, 1.0, 'Mean PSI for Skipping Exons (SE)')
In [40]:
barplot_stats = pd.DataFrame(SE_MATS_JCEC["Significant"].value_counts())
barplot_stats
Out[40]:
Significant
No 22215
FDR<0.05 11233
FDR<0.05 &\nnovel exon &\nNMD target 924
FDR<0.05 &\nnovel exon 428

PART D: Gene level

In [41]:
path = SE_path_dKD
SE_MATS_JCEC = mats_prepare_events_scr_vs_KD(path)

gene_ids_se = list(set(SE_MATS_JCEC[(SE_MATS_JCEC["FDR"] < 0.05)]["GeneID"].tolist()))

# gene expression
DE_path = os.path.join(results_path, "comparison_scr_4_5_6_dKD/DE_edgeR/final_table.tsv")
In [42]:
DE = pd.read_csv(DE_path, sep="\t", header=0)
DE["Significant"] = "No"
DE.loc[DE["FDR"]<0.05, "Significant"] = "FDR<0.05"
DE.head()
Out[42]:
id logFC logCPM LR PValue FDR Significant
0 ENSG00000000005 -2.780532 -4.964222 2.936239 0.086612 0.289845 No
1 ENSG00000000938 3.260032 -3.067405 18.447435 0.000017 0.000158 FDR<0.05
2 ENSG00000001561 0.000000 -5.189132 0.000000 1.000000 1.000000 No
3 ENSG00000002587 1.608965 -4.076697 1.966101 0.160862 0.457482 No
4 ENSG00000002726 -5.133521 -4.183622 12.893778 0.000330 0.002332 FDR<0.05
In [43]:
DE.Significant.value_counts()
Out[43]:
No          46912
FDR<0.05    12276
Name: Significant, dtype: int64
In [44]:
sns.set(font_scale=2)
ax = sns.lmplot(x='logCPM',
                y='logFC',
                hue='Significant',
                data=DE,
                fit_reg=False,
                legend=True,
                palette=["gray", "black"],
                hue_order = ["No", "FDR<0.05"],
                height=15,
                aspect=1
)
In [45]:
DE['Significant merged'] = 'No'
DE.loc[DE["FDR"]<0.05, "Significant merged"] = "FDR<0.05"
DE.loc[(DE["id"].isin(gene_ids_se)) & (DE["Significant"]=="No"), 'Significant merged'] = 'Splicing event, no gene change'
DE.loc[(DE["id"].isin(gene_ids_se)) & (DE["Significant"]=="FDR<0.05"), 'Significant merged'] = 'Splicing event, gene change'
In [46]:
sns.set(font_scale=2)
ax = sns.lmplot(x='logCPM',
           y='logFC',
           hue='Significant merged',
           data=DE,
           fit_reg=False,
           legend=True,
           palette=["gray", "black", "yellow", "red"],
           hue_order = ["No", "FDR<0.05",
                        "Splicing event, no gene change",
                        "Splicing event, gene change"],
            height=15,
            aspect=1
)
In [47]:
barplot_stats = pd.DataFrame(DE["Significant merged"].value_counts())
barplot_stats
Out[47]:
Significant merged
No 44782
FDR<0.05 8825
Splicing event, gene change 3451
Splicing event, no gene change 2130
In [48]:
path = SE_path_dKD
SE_MATS_JCEC = mats_prepare_events_scr_vs_KD(path)

gene_ids_se = list(set(SE_MATS_JCEC[(SE_MATS_JCEC["FDR"] < 0.05)]["GeneID"].tolist()))

DE = pd.read_csv(DE_path, sep="\t", header=0)
DE["Significant"] = "No"
DE.loc[DE["FDR"]<0.05, "Significant"] = "FDR<0.05"
DE.head()
Out[48]:
id logFC logCPM LR PValue FDR Significant
0 ENSG00000000005 -2.780532 -4.964222 2.936239 0.086612 0.289845 No
1 ENSG00000000938 3.260032 -3.067405 18.447435 0.000017 0.000158 FDR<0.05
2 ENSG00000001561 0.000000 -5.189132 0.000000 1.000000 1.000000 No
3 ENSG00000002587 1.608965 -4.076697 1.966101 0.160862 0.457482 No
4 ENSG00000002726 -5.133521 -4.183622 12.893778 0.000330 0.002332 FDR<0.05
In [49]:
DE["NMD_target"] = "No"
DE.loc[DE["id"].isin(NMD_targets_genes), "NMD_target"] = "Yes"
In [50]:
DE["Significant merged"] = "No"
DE.loc[DE["FDR"]<0.05, "Significant merged"] = "FDR<0.05"
DE.loc[(DE["id"].isin(NMD_targets_genes)) & (DE["Significant"]=="No"), 'Significant merged'] = 'NMD target, no gene change'
DE.loc[(DE["id"].isin(NMD_targets_genes)) & (DE["Significant"]=="FDR<0.05"), 'Significant merged'] = 'NMD target, gene change'
In [51]:
sns.set(font_scale=2)
ax = sns.lmplot(x='logCPM',
           y='logFC',
           hue='Significant merged',
           data=DE,
           fit_reg=False,
           legend=True,
           palette=["gray", "black", "yellow", "red"],
           hue_order = ["No", "FDR<0.05",
                        "NMD target, no gene change",
                        "NMD target, gene change"],
            height=15,
            aspect=1
)
In [52]:
barplot_stats = pd.DataFrame(DE["Significant merged"].value_counts())
barplot_stats
Out[52]:
Significant merged
No 45527
FDR<0.05 10082
NMD target, gene change 2194
NMD target, no gene change 1385